/**
 * OpenKM, Open Document Management System (http://www.openkm.com)
 * Copyright (c) 2006-2012 Paco Avila & Josep Llort
 * 
 * No bytes were intentionally harmed during the development of this application.
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

package com.openkm.kea.metadata;

import com.openkm.kea.filter.KEAFilter;

import java.util.List;
import java.util.ArrayList;
import java.util.Date;

import weka.core.Instances;
import weka.core.FastVector;
import weka.core.Attribute;
import weka.core.Instance;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts suggested subjects (keyphrases) from a document text by feeding the
 * text to a {@link KEAFilter} obtained from {@code KEAFilterBank} and collecting
 * the ranked phrases the filter outputs.
 * 
 * <p>Only {@code subjectNumLimit} and the filter are read by the extraction
 * code in this class; the remaining properties are simply stored and exposed
 * through their accessors.</p>
 * 
 * @author jllort
 * 
 */
public class SubjectExtractor {
	private static final Logger log = LoggerFactory.getLogger(SubjectExtractor.class);
	
	/** Number of subjects returned when no explicit limit is given. */
	private static final int DEFAULT_SUBJECT_NUM_LIMIT = 12;
	
	private String modelName = "model";
	private String vocabulary;
	private String vocabularyFormat = "skos";
	private String language = "en";
	private String encoding = "UTF-8";
	private boolean debug = true;
	private int subjectNumLimit = DEFAULT_SUBJECT_NUM_LIMIT;
	private double subjectRelLimit = 1.2;
	private boolean additionalInfo = false;
	private final KEAFilter filter;
	
	/**
	 * Creates an extractor that returns at most the default number of
	 * subjects (12).
	 * 
	 * @throws MetadataExtractionException propagated from {@code KEAFilterBank.getFilter()}.
	 */
	public SubjectExtractor() throws MetadataExtractionException {
		this(DEFAULT_SUBJECT_NUM_LIMIT);
	}
	
	/**
	 * Creates an extractor that returns at most {@code limit} subjects.
	 * 
	 * @param limit maximum number of subjects to return; values below zero
	 *        are treated as zero during extraction.
	 * @throws MetadataExtractionException propagated from {@code KEAFilterBank.getFilter()}.
	 */
	public SubjectExtractor(int limit) throws MetadataExtractionException {
		subjectNumLimit = limit;
		filter = KEAFilterBank.getFilter();
	}
	
	/** @return the configured model name. */
	public String getModelName() {
		return modelName;
	}
	
	/** @param modelName the model name to store. */
	public void setModelName(String modelName) {
		this.modelName = modelName;
	}
	
	/** @return the configured vocabulary, or {@code null} if none was set. */
	public String getVocabulary() {
		return vocabulary;
	}
	
	/** @param vocabulary the vocabulary to store. */
	public void setVocabulary(String vocabulary) {
		this.vocabulary = vocabulary;
	}
	
	/** @return the configured vocabulary format. */
	public String getVocabularyFormat() {
		return vocabularyFormat;
	}
	
	/** @param vocabularyFormat the vocabulary format to store. */
	public void setVocabularyFormat(String vocabularyFormat) {
		this.vocabularyFormat = vocabularyFormat;
	}
	
	/** @return the configured language code. */
	public String getLanguage() {
		return language;
	}
	
	/** @param language the language code to store. */
	public void setLanguage(String language) {
		this.language = language;
	}
	
	/** @return the configured encoding name. */
	public String getEncoding() {
		return encoding;
	}
	
	/** @param encoding the encoding name to store. */
	public void setEncoding(String encoding) {
		this.encoding = encoding;
	}
	
	/** @return the debug flag. */
	public boolean isDebug() {
		return debug;
	}
	
	/** @param debug the debug flag to store. */
	public void setDebug(boolean debug) {
		this.debug = debug;
	}
	
	/** @return the maximum number of subjects returned by an extraction. */
	public int getSubjectNumLimit() {
		return subjectNumLimit;
	}
	
	/**
	 * @param subjectNumLimit the maximum number of subjects to return; values
	 *        below zero are treated as zero during extraction.
	 */
	public void setSubjectNumLimit(int subjectNumLimit) {
		this.subjectNumLimit = subjectNumLimit;
	}
	
	/** @return the configured subject relevance limit. */
	public double getSubjectRelLimit() {
		return subjectRelLimit;
	}
	
	/** @param subjectRelLimit the subject relevance limit to store. */
	public void setSubjectRelLimit(double subjectRelLimit) {
		this.subjectRelLimit = subjectRelLimit;
	}
	
	/** @return the additional-info flag. */
	public boolean isAdditionalInfo() {
		return additionalInfo;
	}
	
	/** @param additionalInfo the additional-info flag to store. */
	public void setAdditionalInfo(boolean additionalInfo) {
		this.additionalInfo = additionalInfo;
	}
	
	/**
	 * Runs the KEA filter over the given text and returns the suggested
	 * subjects ordered by the rank the filter assigned to them.
	 * 
	 * <p>Any exception raised during extraction is logged and whatever was
	 * collected so far (possibly nothing) is returned; this method does not
	 * propagate extraction failures.</p>
	 * 
	 * @param documentText the plain text of the document; {@code null} yields
	 *        an empty result.
	 * @return a mutable list with at most {@code subjectNumLimit} subjects,
	 *         never {@code null}.
	 */
	public List<String> extractSuggestedSubjects(String documentText) {
		List<String> subjects = new ArrayList<String>();
		
		if (documentText == null) {
			log.warn("Subject extraction skipped: document text is null");
			return subjects;
		}
		
		long start = System.currentTimeMillis();
		// Read the limit once so the array size and the bounds check below
		// always agree; a negative limit is treated as "no subjects".
		int limit = Math.max(0, subjectNumLimit);
		
		// Three-attribute dataset ("doc", "keyphrases", "filename") handed to
		// the filter as the structure of the input instance.
		FastVector atts = new FastVector(3);
		atts.addElement(new Attribute("doc", (FastVector) null));
		atts.addElement(new Attribute("keyphrases", (FastVector) null));
		atts.addElement(new Attribute("filename", (String) null));
		Instances dataset = new Instances("keyphrase_training_data", atts, 0);
		
		try {
			// Build the single input instance: the document text goes into
			// attribute 0, attribute 1 (existing keyphrases) is left missing
			// because none are known.
			double[] values = new double[2];
			values[0] = (double) dataset.attribute(0).addStringValue(documentText);
			values[1] = Instance.missingValue();
			dataset.add(new Instance(1.0, values));
			filter.input(dataset.instance(0));
			// NOTE(review): the return value is discarded, so this looks like
			// a no-op; kept to preserve existing behaviour - confirm against
			// the bundled Weka version before removing.
			dataset.stringFreeStructure();
			
			// Drain every output instance from the filter, keeping only those
			// whose 1-based rank falls inside [1, limit].
			Instance[] rankedSubjects = new Instance[limit];
			Instance subject;
			
			while ((subject = filter.output()) != null) {
				int index = (int) subject.value(filter.getRankIndex()) - 1;
				// The lower bound guards against a rank of 0, a negative rank
				// or NaN (which casts to 0), any of which would otherwise
				// abort the loop with an ArrayIndexOutOfBoundsException.
				if (index >= 0 && index < limit) {
					rankedSubjects[index] = subject;
				}
			}
			
			// Ranks that were never produced leave null slots; skip them.
			for (Instance ranked : rankedSubjects) {
				if (ranked != null) {
					subjects.add(ranked.stringValue(filter.getUnstemmedPhraseIndex()));
				}
			}
		} catch (Exception e) {
			// Deliberate best effort: extraction problems must not break the caller.
			log.error("problem in subject extraction: ", e);
		} finally {
			long time = System.currentTimeMillis() - start;
			log.info("Subject extraction completed in {}ms", time);
		}
		
		return subjects;
	}
}
